For the data challenge I need to scrape the text file data from the mta website. After looking over the website and inspecting a couple of files, I noted some challenges to keep in mind when collecting and organizing the data:
import sqlite3 as sql
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta
import pandas as pd
import plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode()
mta_url = 'http://web.mta.info/developers/turnstile.html'
db_name = 'TURNSTILES.db'
table_name = 'thirteen'
headers = ["CA", "UNIT", "SCP", "STATION", "LINE", "DIVISION", "DATE", "TIME", "DESC", "ENTRIES", "EXITS"]
types = ["TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "DATE", "TIME", "TEXT", "INTEGER", "INTEGER"]
page_data = requests.get(mta_url)
pg_dat = page_data.content
content = BeautifulSoup(pg_dat, 'html.parser')
page_links = content.find_all("a", href=True)
t_links = {}
for link in page_links:
if re.match('^data/nyct/turnstile/turnstile.*', link['href']):
t_links[link.text] = 'http://web.mta.info/developers/' + link['href']
print len(t_links), 'links in full dictionary'
'''
get links for 2013
'''
def year_links(url_dict, year):
    """Return the subset of url_dict whose keys fall in the given year.

    Keys look like 'Saturday, January 04, 2014'; the year is the token
    after the second comma.

    url_dict -- mapping of link display text -> file URL
    year     -- year to filter on (int or str)
    Returns a new dict containing only the matching entries.
    """
    # FIX: the original rebound url_dict to the global t_links here, which
    # silently ignored whatever dictionary the caller passed in.
    y_links = {}
    for key in url_dict:
        key_year = key.split(',')[2].split()
        if str(year) in key_year:
            y_links[key] = url_dict[key]
    return y_links
year = 2013
y_links = year_links(t_links, year)
print len(y_links), 'links in %d dictionary' % year
# the last few days of 2013 are in the first file of 2014
y_links['Saturday, January 04, 2014'] = t_links['Saturday, January 04, 2014']
'''
set up my database mta.db and create table thirteen
'''
def database_setup(db_name, table_name):
    """Create the turnstile table (with an autoincrement id) if needed.

    Leaves an open connection/cursor in the module-level conn and cursor;
    the caller is expected to commit and close them.
    """
    global conn, cursor
    conn = sql.connect(db_name)
    cursor = conn.cursor()
    # Pair each column name with its declared affinity, e.g. "CA TEXT".
    columns = ["%s %s" % pair for pair in zip(headers, types)]
    schema = "id INTEGER PRIMARY KEY AUTOINCREMENT, " + ", ".join(columns)
    cursor.execute('CREATE TABLE IF NOT EXISTS ' + table_name + ' (' + schema + ')')

# Create the table, then commit and close until we are ready to populate it.
database_setup(db_name, table_name)
conn.commit()
conn.close()
'''
create function to update format of older files - only older files will be passed to this function
'''
def update_format(row, key_df=None):
    """Convert one old-format data row into a list of new-format rows.

    Old files pack one (booth, remote, scp) identifier followed by repeated
    5-value chunks of (date, time, desc, entries, exits) on a single line,
    and carry no station/line/division columns. Each chunk becomes its own
    row matching the new layout: station info is looked up by remote/booth
    in the key table and the date is reformatted mm-dd-yy -> mm/dd/YYYY.

    row    -- one comma-separated line from an old-format file
    key_df -- DataFrame with Remote/Booth/Station/Line/Division columns;
              defaults to the module-level df loaded by populate_db
    Returns a list of comma-joined strings, one per 5-value chunk.
    """
    if key_df is None:
        key_df = df  # loaded from turnstiles_remote_station_key.csv
    out_row = []
    len_id = 3     # leading identifier columns repeated into every row
    len_entry = 5  # width of each repeating observation chunk
    cells = row.split(',')
    id_vars = ",".join(cells[0:len_id])
    booth, remote = cells[0], cells[1]
    # Single combined mask instead of chained boolean indexing, which relied
    # on index alignment and is deprecated/fragile in newer pandas.
    match = (key_df.Remote == remote) & (key_df.Booth == booth)

    def _first(series):
        # First matching value, or 'NULL' when the key file has no entry
        # (add_entry later converts 'NULL' cells to SQL NULL).
        vals = series[match].values
        return vals[0] if len(vals) > 0 else 'NULL'

    station = _first(key_df.Station)
    linename = _first(key_df.Line)
    division = _first(key_df.Division)
    for entry in range(len_id, len(cells), len_entry):
        # Make the date match the format of the newer files.
        date = datetime.strftime(datetime.strptime(cells[entry], "%m-%d-%y"), "%m/%d/%Y")
        added_info = ",".join([station, linename, division, date])
        out_row.append(id_vars + ',' + added_info + ',' +
                       ','.join(cells[entry + 1: entry + len_entry]))
    return out_row
'''
create function to add entry lines from text files and populate the database
'''
def add_entry(input_line, table_name, cur=None):
    """Insert one comma-separated data line into table_name.

    'NULL' cells become SQL NULL, and a leading None is prepended so the
    autoincrement id column fills itself in. Lines that do not yield
    exactly 12 values (id + 11 data columns) are silently skipped, which
    drops blank or malformed lines.

    cur -- cursor to execute on; defaults to the module-level cursor
    """
    if cur is None:
        cur = cursor
    values = [cell.strip() if cell != 'NULL' else None
              for cell in input_line.split(',')]
    values.insert(0, None)  # placeholder for the autoincrement id
    if len(values) == 12:
        # Placeholder count derived from the values themselves rather than
        # the module-level headers list.
        value_slots = ",".join(['?'] * len(values))
        cur.execute('INSERT INTO ' + table_name + ' VALUES(' + value_slots + ')', values)
'''
create function to loop through the links in my dictionary and populate the database
'''
def populate_db(url_dict, db_name, table_name, outfile_name):
# read in .csv of station keys for old files
global df
df = pd.read_csv('turnstiles_remote_station_key.csv')
df.columns = ['Remote', 'Booth', 'Station', 'Line', 'Division']
with open(outfile_name, 'w') as outfile: # text file to track progress
ctr = 1
outfile.write('Populating MTA database table turnstiles on' + datetime.now().ctime() + '\n')
global conn, cursor
conn = sql.connect(db_name)
cursor = conn.cursor()
for key in url_dict:
url = url_dict[key]
data = requests.get(url)
header = next(data.iter_lines()).split(",")
lines = data.iter_lines()
if len(header) > 11:
for line in lines:
entries = update_format(line)
for entry in entries:
add_entry(entry, table_name)
else:
lines.next()
for line in lines:
if len(line) > 1:
add_entry(line, table_name)
outfile.write('populated data for file number ' + str(ctr) + ': ' + url_dict[key] + '\n')
print 'populated data for file number ' + str(ctr) + ': ' + url_dict[key]
ctr += 1
populate_db(y_links, db_name, table_name, 'downloading_2013_turnstile_data_02062016.txt')
conn.commit()
From initial exploration of the data I have noticed a few issues that have the potential to create big problems for the analysis (note this is only what I've found so far):
conn = sql.connect(db_name)
cursor = conn.cursor()
'''
query number of turnstiles by station
'''
num_stiles_query = "SELECT COUNT(DISTINCT SCP) as turnstiles, STATION, DIVISION, CA FROM thirteen " \
"WHERE STATION is not NULL GROUP BY STATION, "\
"DIVISION ORDER BY turnstiles DESC ;"
top_units = pd.read_sql_query(num_stiles_query, conn)
top_units[0:10]
penn_units = top_units['turnstiles'][top_units['STATION']== '34 ST-PENN STA'].sum()
time_sq_units = pd.concat([top_units['turnstiles'][top_units['STATION'] == '42 ST-TIMES SQ'],
top_units['turnstiles'][top_units['STATION'] == '42 ST-PA BUS TE']]).sum()
print 'The Times Square Station complex has {0} turnstiles, and the Penn Station complex has {1} turnstiles '.format(
*[time_sq_units, penn_units])
'''
cleanup
'''
del top_units, penn_units, time_sq_units
'''
Create new table to make future queries a little easier
'''
# Build a 'daily' summary table: the inner GROUP BY de-duplicates the raw
# REGULAR observations, the UNION stacks each turnstile-day's MAX(TIME) and
# MIN(TIME) readings, and the outer SELECT with min(end) keeps one row per
# turnstile-day. NOTE(review): the outer GROUP BY omits CA while the inner
# ones include it, so turnstiles sharing an SCP across booths may collapse
# into one row — confirm this is intended.
create_daily_table = '''CREATE TABLE daily as select * from (SELECT min(end), DATE, entryend, exitend, SCP, CA, DIVISION, STATION from
(SELECT MAX(TIME) as end, DATE, ENTRIES as entryend, EXITS as exitend, SCP, CA, DIVISION, STATION from
(SELECT * FROM thirteen WHERE DESC = 'REGULAR' AND ENTRIES >= 0 AND
EXITS >= 0 AND STATION IS NOT NULL GROUP BY STATION, DIVISION, SCP, CA, DATE, TIME)
GROUP BY DATE, STATION, DIVISION, CA, SCP
UNION
SELECT MIN(TIME) as start, DATE, ENTRIES as entrystart, EXITS as exitstart, SCP, CA, DIVISION, STATION from
(SELECT * FROM thirteen WHERE DESC = 'REGULAR' AND ENTRIES >= 0 AND
EXITS >= 0 AND STATION IS NOT NULL GROUP BY STATION, DIVISION, SCP, CA, DATE, TIME)
GROUP BY DATE, STATION, DIVISION, CA, SCP
ORDER BY STATION, SCP)
GROUP BY DATE, STATION, DIVISION, SCP ORDER BY STATION, SCP)
;'''
cursor.execute(create_daily_table)
'''
Query database and organize into dataframes
'''
# Per-turnstile daily readings for 08/01-08/02; BUSY combines the entry
# and exit register values.
aug1_query = ("SELECT STATION, DIVISION, CA, DATE, SCP, entryend as ENTRIES, exitend as EXITS, "
              "entryend + exitend as BUSY from daily WHERE entryend >= 0 AND exitend >=0 "
              "AND DATE BETWEEN '08/01/2013' AND '08/02/2013' GROUP BY DATE, STATION, DIVISION, CA, SCP "
              "ORDER BY STATION, DIVISION,SCP, DATE ;")
## get specific turnstile info to get busiest turnstile
aug_df = pd.read_sql_query(aug1_query, conn)
# Tidy dtypes: categorical station/division, real datetimes.
for cat_col in ['STATION', 'DIVISION']:
    aug_df[cat_col] = aug_df[cat_col].astype('category')
aug_df['DATE'] = pd.to_datetime(aug_df['DATE'])
'''
Get turns/day for entries, exits, and turns & drop problem observations
'''
def calculate_differences(df, cols_to_diff):
    """Add <col>_DIFF day-over-day difference columns per turnstile.

    For each column in cols_to_diff, takes the difference of consecutive
    daily readings within each (STATION, DIVISION, SCP) group — the raw
    values are cumulative register readings. Negative differences (counter
    resets) and implausibly large ones (>= 50000) become NaN.

    Returns the mutated df (also modified in place).
    """
    for col in cols_to_diff:
        new_col = col + '_DIFF'
        df[new_col] = df.groupby(['STATION', 'DIVISION', 'SCP'])[col].diff()
        # Mask counter resets and outliers; float('nan') keeps the column numeric.
        df[new_col] = df[new_col].where(df[new_col] >= 0, float('nan'))
        df[new_col] = df[new_col].where(df[new_col] < 50000, float('nan'))
    # Shift DATE back exactly one day, once, after all columns are diffed, so
    # each difference labels the day it was accumulated over. (Inside the
    # loop this would shift DATE once per diffed column.)
    df['DATE'] = df['DATE'] - timedelta(days=1)
    return df
aug_df = calculate_differences(aug_df, ['ENTRIES', 'EXITS', 'BUSY'])
'''
sum entries and exits across all stations for 08/01/2013
'''
total_entries = aug_df[aug_df.DATE == '2013-08-01']['ENTRIES_DIFF'].sum()
print 'approximate number of entries on 08/01/2013: ', int(total_entries)
total_exits = aug_df[aug_df.DATE == '2013-08-01'].groupby(['DATE'])['EXITS_DIFF'].sum().values[0]
print 'approximate number of exits on 08/01/2013: ', int(total_exits)
print 'approximate number of total turns on 08/01/2013', int(total_entries + total_exits)
aug_df[aug_df['DATE'] == "2013-08-01"].groupby(['STATION',
'DIVISION'])['BUSY_DIFF'].sum().sort_values(ascending = False)[0:10]
Defining a station as having a distinct name and division, the busiest station is Grand Central Station with a total turnstile count of 176,960 turns. However, I know that Penn Station and Times Square stations serve multiple divisions. So I am going to look at the busyness for both of these stations, combining across divisions (and in the case of Times Square, the other division has a different station name).
penn_busy = aug_df[aug_df['DATE'] == "2013-08-01"][aug_df['STATION'] == '34 ST-PENN STA']['BUSY_DIFF'].sum()
times_busy = aug_df[aug_df['DATE'] == "2013-08-01"][aug_df['STATION'] == '42 ST-TIMES SQ']['BUSY_DIFF'].sum()
times_busy += aug_df[aug_df['DATE'] == "2013-08-01"][aug_df['STATION'] == '42 ST-PA BUS TE']['BUSY_DIFF'].sum()
print 'Times Square had {0} total turns and Penn Station had {1} total turns'.format(*[int(times_busy), int(penn_busy)])
If I take into account the fact that these hub stations serve multiple divisions, Times Square is the busiest station with a total of 310,074 turns. Additionally, Penn Station has 265,265 turns, which is greater than the number of turns for Grand Central Station.
busy_turnstile = aug_df[aug_df['DATE'] == '2013-08-01'][aug_df['BUSY_DIFF'] == aug_df['BUSY_DIFF'].max()]
t_vals_print = [busy_turnstile['CA'].values[0], busy_turnstile['SCP'].values[0],busy_turnstile['STATION'].values[0],
busy_turnstile['BUSY_DIFF'].values[0]]
print 'the busiest turnstile was {0} {1} in station {2} with {3} turns'.format(*t_vals_print)
The busiest turnstile on 08/01/2013 was N063a 00-00-00 in 42 ST Port Authority with 11,845 turns.
'''
cleanup
'''
del aug_df
'''
query, pull in data frame, format categories
'''
# Daily totals per station/division: sum end-of-day readings over all of a
# station's turnstiles, for every date in the table.
all_days_query = ("SELECT STATION, DIVISION, DATE, sum(entryend) as ENTRIES, sum(exitend) as EXITS, "
                  "sum(entryend) + sum(exitend) as BUSY from daily GROUP BY DATE, "
                  "STATION, DIVISION ORDER BY STATION, DIVISION ;")
days_df = pd.read_sql_query(all_days_query, conn)
for cat_col in ['STATION', 'DIVISION']:
    days_df[cat_col] = days_df[cat_col].astype('category')
days_df['DATE'] = pd.to_datetime(days_df['DATE'])
'''
take difference between days of 'busy' to get turns per day - grouped by station/division
'''
def calculate_differences_noSCP(df, cols_to_diff):
    """Add <col>_DIFF day-over-day differences per (STATION, DIVISION).

    Station-level counterpart of calculate_differences: diffs consecutive
    daily totals within each station/division group, masking negative
    differences (counter resets) and outliers >= 800000 as NaN.

    Returns the mutated df (also modified in place).
    """
    for col in cols_to_diff:
        new_col = col + '_DIFF'
        df[new_col] = df.groupby(['STATION', 'DIVISION'])[col].diff()
        df[new_col] = df[new_col].where(df[new_col] >= 0, float('nan'))
        df[new_col] = df[new_col].where(df[new_col] < 800000, float('nan'))
    # Shift DATE back exactly one day, once, after all columns are diffed,
    # so each difference labels the day it covers.
    df['DATE'] = df['DATE'] - timedelta(days=1)
    return df
days_df = days_df[days_df.DATE <= '2013-12-31'][days_df.DATE >= '2013-01-01'].reset_index()
days_df = calculate_differences_noSCP(days_df, ['ENTRIES', 'EXITS', 'BUSY'])
'''
sanity check: are my busiest stations reporting the highest number of turns per day?
'''
avg_diff = days_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF'].mean()
print avg_diff.sort_values(ascending = False)[0:10]
'''
get change in turns per day, and average change - grouped by station/division
'''
days_df['BUSY_DIFF2'] = days_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF'].diff()
avg_2nd_diff = days_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF2'].mean()
# which stations are declining?
print 'declining'
print avg_2nd_diff.sort_values()[0:10]
#growing?
print 'growing'
print avg_2nd_diff.sort_values(ascending = False)[0:10]
Based on my conceptualization of growth and decline within 2013 as average change in the rate of turns per day:
To look at growth and declines in usage over the course of 2013, I looked at the average rate of change of turns per day for each station. This gives me an idea of trends within the year, but this isn't the same as directly comparing average number of riders per station for 2012 and 2013. Therefore the changes I am seeing could reflect seasonal trends, construction projects, etc.
# System-wide turns per day, plus z-scores to flag unusually slow days.
av_turns_all_station = days_df.groupby('DATE')['BUSY_DIFF'].sum()
av_turns = av_turns_all_station.mean()
# average turns per day 7,966,954 - about 8 million turns per day
z_turns = (av_turns_all_station - av_turns_all_station.mean()) / av_turns_all_station.std()
z_turns.sort_values()[0:10]
'''
compare each station to its own daily average
'''
zscore = lambda x: (x - x.mean()) / x.std()
days_df['DAILYZ'] = days_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF'].transform(zscore)
'''
A station is experiencing low traffic if the number of turns is more than 2 standard deviations below the mean
'''
low_traffic = days_df[days_df['DAILYZ'] < -2]
'''
a station will be closed if the difference in busy-ness between days is 0
'''
closed = days_df[days_df['BUSY_DIFF'] == 0]
'''
To get an
'''
closed.sort_values(by = ['DAILYZ','DATE'])[0:10]
'''
Query table thirteen to get a count of all observations (including irregular that were previously ignored)
'''
row_count_query = ("SELECT DATE, COUNT(DISTINCT id) as rows FROM thirteen WHERE "
                   "DATE BETWEEN '07/01/2013' AND '09/30/2013' GROUP BY DATE ;")
row_count_df = pd.read_sql_query(row_count_query, conn)
row_count_df['DATE'] = pd.to_datetime(row_count_df['DATE'])
# Line plot of the raw observation count per day across Q3.
traces_row = [Scatter(x=row_count_df.DATE, y=row_count_df.rows, name='Number of Observations',
                      line=dict(color='rgb(106,81,163)', width='4'))]
iplot({"data": traces_row,
       "layout": {'title': "Number of Turnstile Observations Per Day in Q3 2013",
                  'xaxis': {'title': 'Date'},
                  'yaxis': {'title': 'Number of Observations'}}})
# Restrict the station-level daily frame to Q3 2013.
q3_df = days_df[days_df.DATE >= '2013-07-01'][days_df.DATE < '2013-10-01']
# del days_df
turns_day = q3_df.groupby('DATE')['ENTRIES_DIFF', 'EXITS_DIFF'].sum()
turns_day.index.name = 'date'
turns_day = turns_day.reset_index()
turns_day.columns = ['Date', 'Entries', 'Exits']
# Stacked bars: total entries and exits per day across the quarter.
traces = [Bar(x=turns_day.Date, y=turns_day.Entries, name='Entries',
              marker=dict(color='rgb(117,107,177)')),
          Bar(x=turns_day.Date, y=turns_day.Exits, name='Exits',
              marker=dict(color='rgb(188,189,220)'))]
iplot({"data": traces,
       "layout": {'barmode': 'stack',
                  'title': "Daily Entries and Exits for Q3 2013",
                  'xaxis': {'title': 'Date'},
                  'yaxis': {'title': 'Total Turns from Entries and Exits'}}})
# Focus on Penn Station and label each row with its month.
penn_df = q3_df[q3_df['STATION'] == '34 ST-PENN STA']
penn_df['MONTH'] = penn_df['DATE'].apply(lambda x: datetime.strftime(x, '%B-%Y'))

def _monthly_summary(col):
    # Rounded monthly mean (as a Month/Average frame) and raw monthly std
    # of the given daily-difference column.
    avg = penn_df.groupby('MONTH')[col].mean().round(2)
    std = penn_df.groupby('MONTH')[col].std()
    avg.index.name = 'month'
    avg = avg.reset_index()
    avg.columns = ['Month', 'Average']
    return avg, std

penn_entry_av, penn_entry_std = _monthly_summary('ENTRIES_DIFF')
penn_exit_av, penn_exit_std = _monthly_summary('EXITS_DIFF')
month_names = ['July', 'August', 'September']
# Grouped bars with std error bars for entries vs exits.
traces2 = [Bar(x=month_names, y=penn_entry_av.Average, name='Avg Entries',
               error_y=dict(type='data', array=penn_entry_std, visible=True),
               marker=dict(color='rgb(117,107,177)')),
           Bar(x=month_names, y=penn_exit_av.Average, name='Avg Exits',
               error_y=dict(type='data', array=penn_exit_std, visible=True),
               marker=dict(color='rgb(188,189,220)'))]
layout = Layout(
    title='Average of Daily Turns at 34 St Penn Station in Q3 2013',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Average Daily Turns'))
py.offline.iplot({'data': traces2, 'layout': layout})
# Box plots of the daily-turn distributions at Penn Station, by month.
traces3 = [Box(x=penn_df['MONTH'], y=penn_df['ENTRIES_DIFF'], name='Entries',
               marker=dict(color='rgb(117,107,177)')),
           Box(x=penn_df['MONTH'], y=penn_df['EXITS_DIFF'], name='Exits',
               marker=dict(color='rgb(188,189,220)'))]
layout = Layout(
    title='Monthly Percentiles of Daily Turns at 34 St Penn Station in Q3 2013',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Average Daily Turns'),
    boxmode='group')
py.offline.iplot({'data': traces3, 'layout': layout})
# Standardize each station's Q3 daily turns against its own Q3 history.
q3_df['DAILYZ'] = q3_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF'].transform(zscore)
'''
A station is experiencing low traffic if the number of turns is more than 2 standard deviations below the mean
'''
q3_df['LOW'] = q3_df['DAILYZ'] < -2
low_traffic = q3_df.groupby(['DATE'])['LOW'].sum()
low_traffic.index.name = 'DATE'
low_traffic = low_traffic.reset_index()
low_traffic.columns = ['DATE', 'LOW']
'''
a station will be closed if the difference in busy-ness between days is 0
'''
q3_df['CLOSED'] = q3_df['BUSY_DIFF'] == 0
closed = q3_df.groupby(['DATE'])['CLOSED'].sum()
closed.index.name = 'DATE'
closed = closed.reset_index()
closed.columns = ['DATE', 'CLOSED']
# Grouped bars: count of low-traffic vs closed stations per day.
traces4 = [Bar(x=low_traffic.DATE, y=low_traffic.LOW, name='Below Capacity',
               marker=dict(color='rgb(117,112,179)')),
           Bar(x=closed.DATE, y=closed.CLOSED, name='Closed',
               marker=dict(color='rgb(217,95,2)'))]
py.offline.iplot({"data": traces4,
                  "layout": {'barmode': 'group',
                             'title': "Stations Operating Below Capacity or Closed During Q3 2013",
                             'xaxis': {'title': 'Date'},
                             'yaxis': {'title': 'Number of Stations'}}})